The settings list settings configure which parts, simulations, and analysis steps of this R markdown are executed. This way single components of the analysis can be disabled for the purpose of saving computation time.
settings <- list(
# Generate interactive plots of co-expression structures
investigate.coexpression=F,
# Perform Differential Expression Analysis
perform.DEA=T,
# Generate paper figures
generate.paper.figs=T,
# Specify if paper figures are exported
export.figures = T,
# Path for file exports
paper.fig.path = "../danawriteup/figs/",
# Clears environment
debug=TRUE
)
if(settings$debug) rm(list=ls()[ls()!="settings"])
The config list contains all parameters for the analysis. Remember to always set the working directory and the paths to the data/external files prior to running the code.
config <- list(
DANA.path = "./R/DANA.R",
## Data
case.name = "TCGA",
# Read count data file for the full data set
data.file.path = "data/TCGA_harmonized_BRCA_UCS.RData",
# RData file for coordinates data frame based on miRBase miRNA definitions
coords.file.path = "data/miRBase.coords.RData",
## Normalization Methods
# Specify which normalization methods will be applied
norm.apply.TC = TRUE,
norm.apply.UQ = TRUE,
norm.apply.med = TRUE,
norm.apply.TMM = TRUE,
norm.apply.DESeq = TRUE,
norm.apply.PoissonSeq = TRUE,
norm.apply.QN = TRUE,
norm.apply.RUV = TRUE,
# thresholds for zero-expressed, poorly-expressed and well-expressed genes
t.zero = 2, # zero-expressed in [0, t.zero)
t.poor = 5, # poorly-expressed in [t.zero, t.well]
t.well = 100, # well-expressed in [t.well, inf)
# distance threshold for clustering
cluster.threshold = 10000,
# preprocessing data transformation type: none ("n"), modified log2 ("log2"),
# or cube root ("cube") available
preprocess.transform = "log2",
## Correlation Estimation for positive and negative controls
# Available methods | Tuning parameter calibration schemes
# "mb" | "cv", "aic", "bic", "av"
# "huge.mb" | "ric", "stars"
# "glasso" | "ric", "stars", "ebic"
# "fastggm" | none
# "pearson" | none
# "spearman" | none
# Positive Controls
corr.method.pos = "mb",
tuning.criterion.pos = "bic",
# Negative Controls
corr.method.neg = "pearson",
tuning.criterion.neg = "",
# Generate plots within DANA
generate.plots = FALSE
)
Configuration for the differential expression analysis
config.DEA <- list(
## Data
case.name = "TCGA",
## Normalization Methods
# Specify which normalization methods will be applied
norm.apply.TC = TRUE,
norm.apply.UQ = TRUE,
norm.apply.med = TRUE,
norm.apply.TMM = TRUE,
norm.apply.DESeq = TRUE,
norm.apply.PoissonSeq = TRUE,
norm.apply.QN = TRUE,
norm.apply.RUV = FALSE,
# Significance level for DEA
alpha = 0.01,
# Plots
generate.volcano.plot = TRUE,
generate.meanvar.plot = TRUE,
# Use q-values (adjusted p-values) instead of p-values
q.values = FALSE,
# RUV reduces the parameter size. Reduce DEA result to RUV genes?
perform.subsetting = FALSE
)
Load all required R libraries/packages.
# DANA functions
source(config$DANA.path)
# Libraries
library(openxlsx) # read excel files
library(corrplot) # For mixed correlation plots
library(cowplot) # Arrange plots
library(RColorBrewer) # Colors
library(latex2exp) # for latex in axis labels
library(ggcorrplot) # ggplot2 correlation plots
We load the data set into memory.
# Load the p x n matrix of read counts into the workspace
load(config$data.file.path)
# Unlist the data
data.BRCA <- TCGA.BRCA.UCS$BRCA
data.UCS <- TCGA.BRCA.UCS$UCS
# Transform gene names to lower case
rownames(data.BRCA) <- tolower(rownames(data.BRCA))
rownames(data.UCS) <- tolower(rownames(data.UCS))
# Build single RC matrix
data.RC <- data.TCGA <- cbind(data.BRCA, data.UCS)
data.groups <- c(rep("BRCA", dim(data.BRCA)[2]), rep("UCS", dim(data.UCS)[2]))
# Inspect
cat("Dimensions of the data: ", dim(data.RC), "\n")
## Dimensions of the data: 1881 223
A coordinates data frame that specifies the base pair location of each miRNA of the data set on the chromosomes is loaded.
# Load miRNA chromosome location coordinates data frame
load(config$coords.file.path)
coords <- coords # change according to the name of the loaded data matrix
Some miRNAs map to multiple locations of sequence families. We named the sequence family with a parenthesis that reflects the number of members from all the different genomic locations (e.g. hsa-let-7a(3)). These miRNAs cannot be uniquely mapped to the genome, therefore we must exclude these from the location based analysis.
# only consider genes that are present in the coordinate data frame
genes.in.coords <- rownames(coords)[na.omit(match(rownames(data.RC), rownames(coords)))]
cat(dim(data.RC)[1]-length(genes.in.coords), " genes not found in coords. Reducing data set to ", length(genes.in.coords), "genes.\n")
## 33 genes not found in coords. Reducing data set to 1848 genes.
# test data set
data.RC <- data.RC[genes.in.coords, ]
# benchmark data set
genes <- rownames(data.RC)
rm(genes.in.coords)
First, we investigate the distribution of mean miRNA expression of the data.
# Histogram plot test data set
par(mfrow=c(1,1))
df <- data.frame(log.expression=log2(rowMeans(data.RC)+1))
print(ggplot(df, aes(x=log.expression)) +
geom_histogram(binwidth = .1, color="black", fill="black") +
geom_vline(xintercept = log2(config$t.zero+1), color="blue", linetype="dashed") +
geom_vline(xintercept = log2(config$t.poor+1), color="blue", linetype="dashed") +
geom_vline(xintercept = log2(config$t.well+1), color="red", linetype="dashed") +
ggtitle("Test data set"))
rm(df)
We observe that there is a large proportion of poorly-expressed genes. Some of them have extremely low or zero mean expression which corresponds to essentially zero reads.
We apply the full analysis pipeline to the data set. This includes:
The following parameters are used:
genes <- rownames(data.RC)
## Apply Normalization
data.norm <- normalize(data.RC,
groups=data.groups,
name=config$case.name,
apply.TC=config$norm.apply.TC,
apply.UQ=config$norm.apply.UQ,
apply.med=config$norm.apply.med,
apply.TMM=config$norm.apply.TMM,
apply.DESeq=config$norm.apply.DESeq,
apply.PoissonSeq=config$norm.apply.PoissonSeq,
apply.QN=config$norm.apply.QN,
apply.RUV=config$norm.apply.RUV)
## 1096 genes has been filtered because they contains too small number of reads across the experiments.
## Define Controls
pre.res <- define.controls(data.RC,
t.zero=config$t.zero,
t.poor=config$t.poor,
t.well=config$t.well,
t.cluster=config$cluster.threshold,
coords=coords,
clusters=NULL)
## Number of positive control markers with RC in [100, inf): 116
## Number of negative control markers with RC in [2, 5]: 91
pos.controls <- pre.res$genes.pos
neg.controls <- pre.res$genes.neg
clusters <- pre.res$clusters
clusters.data <- clusters
# Apply DANA pipeline
DANA.results <- DANA(data.RC=data.RC,
data.norm=data.norm,
pos.controls=pos.controls,
neg.controls=neg.controls,
clusters=clusters.data,
coords=coords,
case.name="TCGA",
generate.plots=config$generate.plots,
preprocess.transform=config$preprocess.transform,
corr.method.pos=config$corr.method.pos,
tuning.criterion.pos=config$tuning.criterion.pos,
corr.method.neg=config$corr.method.neg,
tuning.criterion.neg=config$tuning.criterion.neg)
## Estimating correlations for pos. and neg. controls for data set TCGA...done
## Estimating correlations for pos. and neg. controls for data set TCGA.TMM...done
## Estimating correlations for pos. and neg. controls for data set TCGA.DESeq...done
## Estimating correlations for pos. and neg. controls for data set TCGA.PoissonSeq...done
## Estimating correlations for pos. and neg. controls for data set TCGA.TC...done
## Estimating correlations for pos. and neg. controls for data set TCGA.Med...done
## Estimating correlations for pos. and neg. controls for data set TCGA.RUVg...done
## Estimating correlations for pos. and neg. controls for data set TCGA.RUVs...done
## Estimating correlations for pos. and neg. controls for data set TCGA.RUVr...done
## Estimating correlations for pos. and neg. controls for data set TCGA.UQ...done
## Estimating correlations for pos. and neg. controls for data set TCGA.QN...done
## Comparing models TCGA vs. TCGA.TMM....done
## Comparing models TCGA vs. TCGA.DESeq....done
## Comparing models TCGA vs. TCGA.PoissonSeq....done
## Comparing models TCGA vs. TCGA.TC....done
## Comparing models TCGA vs. TCGA.Med....done
## Comparing models TCGA vs. TCGA.RUVg....done
## Comparing models TCGA vs. TCGA.RUVs....done
## Comparing models TCGA vs. TCGA.RUVr....done
## Comparing models TCGA vs. TCGA.UQ....done
## Comparing models TCGA vs. TCGA.QN....done
if(!config$generate.plots) {
stargazer(DANA.results$metrics, type="text", summary=FALSE, digits=4,
title="Comparison statistics for the TCGA-BRCA/UCS data set", align=TRUE)
}
##
## Comparison statistics for the TCGA-BRCA/UCS data set
## =============================
## cc mscr
## -----------------------------
## TCGA.TMM 0.9904 0.7936
## TCGA.DESeq 0.9896 0.8031
## TCGA.PoissonSeq 0.9882 0.7958
## TCGA.TC 0.9927 0.6539
## TCGA.Med 0.9245 0.5716
## TCGA.RUVg 0.9854 0.8171
## TCGA.RUVs 0.8288 0.8551
## TCGA.RUVr 0.9763 0.8359
## TCGA.UQ 0.9374 0.6053
## TCGA.QN 0.9717 0.8308
## -----------------------------
iplot.data.noNorm <- iggplot.corr(DANA.results$data.model$pos$corr, clusters=clusters, title="Positive controls (un-normalized)", coords=coords)
iplot.data.TMM <- iggplot.corr(DANA.results$norm.models$TCGA.TMM$pos$corr, clusters=clusters, title="Positive controls (TMM)", coords=coords)
We compare the estimated co-expression structures for the test and benchmark dataset. Here, we take a look at the models for the un-normalized data as well as TMM-normalized data. Note that you cannot directly compare the graphs of benchmark with test data since the considered set of genes is not identical.
if(settings$investigate.coexpression) {
htmltools::tagList(list(iplot.data.noNorm, iplot.data.TMM)) # show plots in markdown
}
if(settings$perform.DEA) {
## Reset data
data.RC <- data.TCGA
## Normalize test Data
data.norm <- normalize(data.RC,
groups=data.groups,
name="TCGA",
apply.TC=config.DEA$norm.apply.TC,
apply.UQ=config.DEA$norm.apply.UQ,
apply.med=config.DEA$norm.apply.med,
apply.TMM=config.DEA$norm.apply.TMM,
apply.DESeq=config.DEA$norm.apply.DESeq,
apply.PoissonSeq=config.DEA$norm.apply.PoissonSeq,
apply.QN=config.DEA$norm.apply.QN,
apply.RUV=FALSE)
# RUV normalization
if(config.DEA$norm.apply.RUV) {
data.norm.RUV.adjust <- list(test.RUVr=norm.RUV(data.RC, data.groups, "RUVr")$adjust.factor,
test.RUVs=norm.RUV(data.RC, data.groups, "RUVs")$adjust.factor,
test.RUVg=norm.RUV(data.RC, data.groups, "RUVg")$adjust.factor)
}
}
## 1124 genes has been filtered because they contains too small number of reads across the experiments.
if(settings$perform.DEA) {
## Perform DEA on full dataset
test.DEA <- DE.voom(data.RC=data.RC, groups=data.groups, plot=config.DEA$generate.meanvar.plot, plot.title="Un-normalized")
if(config.DEA$generate.volcano.plot) plot.DE.volcano(test.DEA, alpha=config.DEA$alpha, q.values=config.DEA$q.values, title="Un-normalized")
## DEA for normalized test data
data.norm.DEA <- list()
for(i in 1:length(data.norm)) {
data.norm.DEA <-
append(data.norm.DEA, list(DE.voom(data.RC=data.norm[[i]],
groups=data.groups,
plot=config.DEA$generate.meanvar.plot,
plot.title=names(data.norm)[i])))
if(config.DEA$generate.volcano.plot) {
plot.DE.volcano(data.norm.DEA[[i]],
alpha=config.DEA$alpha,
q.values=config.DEA$q.values,
title=names(data.norm)[i])
}
}
## DEA for RUV normalization
if(config.DEA$norm.apply.RUV) {
for(i in 1:length(data.norm.RUV.adjust)) {
data.norm.DEA <-
append(data.norm.DEA, list(DE.voom(data.RC=data.RC,
groups=data.groups,
plot=config.DEA$generate.meanvar.plot,
plot.title=names(data.norm.RUV.adjust)[i],
adjust=data.norm.RUV.adjust[[i]])))
if(config.DEA$generate.volcano.plot) {
plot.DE.volcano(data.norm.DEA[[i]],
alpha=config.DEA$alpha,
q.values=config.DEA$q.values,
title=names(data.norm.RUV.adjust)[i])
}
}
names(data.norm.DEA) <- c(names(data.norm), names(data.norm.RUV.adjust))
} else {
names(data.norm.DEA) <- names(data.norm)
}
}
## number of DE genes: 1515
## number of DE genes: 1552
## number of DE genes: 1586
## number of DE genes: 1243
## number of DE genes: 597
## number of DE genes: 1198
## number of DE genes: 1134
## number of DE genes: 554
if(settings$generate.paper.figs) {
# Dimension of data after analysis
cat("Data:\n")
cat(" - Dimensions: p=", dim(data.RC)[1],", n=", dim(data.RC)[2], "\n", sep="")
cat(" - Positive controls:\n")
cat(" * Definition mean RC in interval: [", config$t.well, ", inf ]\n", sep="")
cat(" * Number of positive controls miRNAs:", length(pos.controls), "\n")
cat(" - Negative controls:\n")
cat(" * Definition mean RC in interval: [", config$t.zero, ",", config$t.poor, "]\n")
cat(" * Number of negative controls miRNAs:", length(neg.controls), "\n")
}
## Data:
## - Dimensions: p=1881, n=223
## - Positive controls:
## * Definition mean RC in interval: [100, inf ]
## * Number of positive controls miRNAs: 116
## - Negative controls:
## * Definition mean RC in interval: [ 2 , 5 ]
## * Number of negative controls miRNAs: 91
if(settings$generate.paper.figs) {
par(mfrow=c(1,1))
# Histogram plot test data set
data.RC.log2 <- log2(rowMeans(data.RC)+1)
df <- data.frame(log.expression=data.RC.log2)
p.data.RC.hist <- ggplot(df, aes(x=log.expression)) +
geom_histogram(binwidth = .1, color="black", fill="black") +
geom_vline(xintercept = log2(config$t.zero+1), color="blue", linetype="dashed") +
geom_vline(xintercept = log2(config$t.poor+1), color="blue", linetype="dashed") +
geom_vline(xintercept = log2(config$t.well+1), color="red", linetype="dashed") +
ggtitle("TCGA-BRCA/UCS data set") +
theme_classic()
print(p.data.RC.hist)
p.RC.hist <- ggplot(df) +
theme_classic() +
geom_histogram(aes(x=log.expression, y=..count..), binwidth = .1) +
geom_vline(xintercept = log2(config$t.zero+1), color="#5851b8", linetype="twodash") +
geom_vline(xintercept = log2(config$t.poor+1), color="#5851b8", linetype="twodash") +
geom_vline(xintercept = log2(config$t.well+1), color="#E7298A", linetype="longdash") +
geom_segment(aes(x = log2(config$t.zero+1), y = 600, xend = log2(config$t.poor+1), yend = 600),
arrow=arrow(length=unit(.07, "in"), ends="both"),
color="#5851b8") +
annotate(geom="label", x=(log2(config$t.zero+1)+log2(config$t.poor+1))/2.0, y=630,
label="neg. contr.", color="#5851b8", size=4, fill = alpha(c("white"),0.85), label.size = NA) +
geom_segment(aes(x = log2(config$t.well+1), y = 600, xend = log2(config$t.well+1)+4, yend = 600),
arrow=arrow(length=unit(.07, "in"), ends="last"),
color="#E7298A") +
annotate(geom="label", x=log2(config$t.well+1)+.5, y=630,
label="pos. contr.", color="#E7298A", size=4, hjust=0, fill = alpha(c("white"),0.85), label.size = NA) +
xlab("Mean log2(read count)") +
ylab("Frequency")
print(p.RC.hist)
## Export Plots
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_RC_Distribution.pdf"), p.RC.hist, width=5, height=4, device="pdf")
}
}
if(settings$generate.paper.figs) {
# Function for mean-variance plot for a given data set
mean.variance.plot <- function(data.RC, title, axis.limit=NULL) {
df <- data.frame(data.mean=log10(rowMeans(data.RC) + 1),
data.var =log10(rowVars(data.RC) + 1))
lin.fit <- lm(df$data.var ~ df$data.mean)$coefficients
p <- ggplot(df,aes(x=data.mean,y=data.var)) +
geom_point(alpha=.25) +
xlab("log10(Mean)") +
ylab("log10(Variance)") +
geom_abline(intercept = lin.fit[1], slope = lin.fit[2], color="red", linetype="longdash", size=1) +
geom_abline(intercept = 0, slope = 1, colour="blue") +
annotate("text", alpha=1, x=4, y=0.5, hjust=0, vjust=0, size=3, colour="red",
label=paste0("log10(Variance) = ", round(lin.fit[1],2), " + ", round(lin.fit[2],2), "*log10(Mean)")) +
xlim(0, ifelse(is.null(axis.limit), max(df), axis.limit)) +
ylim(0, ifelse(is.null(axis.limit), max(df), axis.limit)) +
ggtitle(title) +
theme(aspect.ratio=1, plot.title = element_text(hjust = 0.5)) +
theme_bw()
return(p)
}
# Both subtypes
p.meanvar <- mean.variance.plot(data.RC, "Full data set", axis.limit=12.5)
print(p.meanvar)
# BRCA
p.meanvar.BRCA <- mean.variance.plot(data.RC[, data.groups=="BRCA"], "BRCA", axis.limit=12.5)
print(p.meanvar.BRCA)
# UCS
p.meanvar.UCS <- mean.variance.plot(data.RC[, data.groups=="UCS"], "UCS", axis.limit=12.5)
print(p.meanvar.UCS)
}
if(settings$generate.paper.figs) {
# Both subtypes
p.meansd <- plot.mean.sd(data.RC, config$t.zero, config$t.poor, config$t.well, "Full data set")
print(p.meansd)
# BRCA
p.meansd.BRCA <- plot.mean.sd(data.RC[, data.groups=="BRCA"], config$t.zero, config$t.poor, config$t.well, "BRCA")
print(p.meansd.BRCA)
# UCS
p.meansd.UCS <- plot.mean.sd(data.RC[, data.groups=="UCS"], config$t.zero, config$t.poor, config$t.well, "UCS")
print(p.meansd.UCS)
## Export Plots
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_MeanSD.pdf"), p.meansd + labs(title = NULL), width = 5, height=4, device="pdf")
}
}
if(settings$generate.paper.figs) {
num.genes.plot.pos <- 60
num.genes.plot.neg <- 60
# Co-expression models
data.model <- DANA.results$data.model
data.norm.model <- DANA.results$norm.models$TCGA.DESeq
# Common control miRNAs
pos.controls <- rownames(data.model$pos$corr)
pos.controls.norm <- rownames(data.norm.model$pos$corr)
pos.controls.common <- intersect(pos.controls, pos.controls.norm)
neg.controls <- rownames(data.model$neg$corr)
neg.controls.norm <- rownames(data.norm.model$neg$corr)
# reduce the number of genes for the plot
num.genes.plot.pos <- min(num.genes.plot.pos, length(pos.controls.common))
pos.controls.plot <- pos.controls.common[1:num.genes.plot.pos]
num.genes.plot.neg <- min(num.genes.plot.neg,
dim(data.model$neg$corr)[1],
dim(data.norm.model$neg$corr)[1])
# Subsetted correlation matrices
corr.data.pos <- data.model$pos$corr[pos.controls.plot, pos.controls.plot]
corr.data.DESeq.pos <- data.norm.model$pos$corr[pos.controls.plot, pos.controls.plot]
corr.data.neg <- data.model$neg$corr[1:num.genes.plot.neg, 1:num.genes.plot.neg]
corr.data.DESeq.neg <- data.norm.model$neg$corr[1:num.genes.plot.neg, 1:num.genes.plot.neg]
## Generate plots
# Positive controls
p.corr.pos <- ggplot.corr(corr.data.pos,
clusters=clusters,
threshold=0,
title="Un-normalized",
coords=coords)
p.corr.pos.DESeq <- ggplot.corr(corr.data.DESeq.pos,
clusters=clusters,
threshold=0,
title="DESeq",
coords=coords)
p.corr.pos.two <- arrangeGrob(p.corr.pos.DESeq + theme(legend.position="none"),
p.corr.pos + theme(legend.position="none"),
get.legend(p.corr.pos.DESeq + theme(legend.position = "bottom")),
layout_matrix=rbind(c(1,2), c(4,4)),
heights = c(5,1))
grid.arrange(p.corr.pos.two) # display plot
# Negative controls
p.corr.neg <- ggplot.corr(corr.data.neg,
clusters=clusters,
threshold=0,
title="Un-normalized")
p.corr.neg.DESeq <- ggplot.corr(corr.data.DESeq.neg,
clusters=clusters,
threshold=0,
title="DESeq")
p.corr.neg.two <- arrangeGrob(p.corr.neg.DESeq + theme(legend.position="none"),
p.corr.neg + theme(legend.position="none"),
get.legend(p.corr.neg.DESeq + theme(legend.position = "bottom")),
layout_matrix=rbind(c(1,2), c(4,4)),
heights = c(5,1))
grid.arrange(p.corr.neg.two) # display plot
## Export Plots
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_CorrPos_DESeq.pdf"), p.corr.pos.two, width = 5, height=3.5, device="pdf")
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_CorrNeg_DESeq.pdf"), p.corr.neg.two, width = 5, height=3.5, device="pdf")
}
}
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
if(settings$generate.paper.figs) {
p.corr.noNorm <- ggplot.corr(DANA.results$data.model$pos$corr,
clusters=clusters,
title="un-normalized")
p.corr.TMM <- ggplot.corr(DANA.results$norm.models$TCGA.TMM$pos$corr,
clusters=clusters,
title="TMM")
p.corr.DESeq <- ggplot.corr(DANA.results$norm.models$TCGA.DESeq$pos$corr,
clusters=clusters,
title="DESeq")
p.corr.PoissonSeq <- ggplot.corr(DANA.results$norm.models$TCGA.PoissonSeq$pos$corr,
clusters=clusters,
title="PoissonSeq")
p.corr.TC <- ggplot.corr(DANA.results$norm.models$TCGA.TC$pos$corr,
clusters=clusters,
title="TC")
p.corr.Med <- ggplot.corr(DANA.results$norm.models$TCGA.Med$pos$corr,
clusters=clusters,
title="Med")
p.corr.RUVg <- ggplot.corr(DANA.results$norm.models$TCGA.RUVg$pos$corr,
clusters=clusters,
title="RUVg")
p.corr.RUVs <- ggplot.corr(DANA.results$norm.models$TCGA.RUVs$pos$corr,
clusters=clusters,
title="RUVs")
p.corr.RUVr <- ggplot.corr(DANA.results$norm.models$TCGA.RUVr$pos$corr,
clusters=clusters,
title="RUVr")
p.corr.UQ <- ggplot.corr(DANA.results$norm.models$TCGA.UQ$pos$corr,
clusters=clusters,
title="UQ")
p.corr.QN <- ggplot.corr(DANA.results$norm.models$TCGA.QN$pos$corr,
clusters=clusters,
title="QN")
# Arrange plots
p.corr.all <-
plot_grid(p.corr.noNorm + theme(legend.position="none"),
p.corr.TMM + theme(legend.position="none"),
p.corr.DESeq + theme(legend.position="none"),
p.corr.PoissonSeq + theme(legend.position="none"),
p.corr.TC + theme(legend.position="none"),
p.corr.Med + theme(legend.position="none"),
p.corr.RUVg + theme(legend.position="none"),
p.corr.RUVs + theme(legend.position="none"),
p.corr.RUVr + theme(legend.position="none"),
p.corr.UQ + theme(legend.position="none"),
p.corr.QN + theme(legend.position="none"),
get.legend(p.corr.noNorm + theme(legend.position = "bottom")),
ncol=3)
plot(p.corr.all)
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_Corr_All.pdf"), p.corr.all, width=9, height=12, device="pdf")
}
}
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
if(settings$generate.paper.figs) {
models <- list(
"no-normalization" = DANA.results$data.model,
TMM = DANA.results$norm.models$TCGA.TMM,
DESeq = DANA.results$norm.models$TCGA.DESeq,
PoissonSeq = DANA.results$norm.models$TCGA.PoissonSeq,
RUVg = DANA.results$norm.models$TCGA.RUVg,
RUVs = DANA.results$norm.models$TCGA.RUVs,
RUVr = DANA.results$norm.models$TCGA.RUVr,
TC = DANA.results$norm.models$TCGA.TC,
Med = DANA.results$norm.models$TCGA.Med,
UQ = DANA.results$norm.models$TCGA.UQ,
QN = DANA.results$norm.models$TCGA.QN
)
# Set number of genes for negative correlation to minimum found
num.genes.plot <- min(sapply(models, function(x) dim(x$neg$corr)[1]))
# Subsetted correlation matrices
corrs <- lapply(models, function(x) x$neg$corr[1:num.genes.plot, 1:num.genes.plot])
corrs <- lapply(corrs, function(x) x[upper.tri(x)])
control <- factor(
as.vector(sapply(names(corrs), function(x) rep(x,length(corrs[[1]])))),
levels = names(models)
)
neg.corr <- data.frame(
control=factor(control),
corr=unname(unlist(corrs))
)
# plot insert
p.insert <- ggplot(neg.corr, aes(x=corr, colour=control, linetype=control)) +
geom_vline(xintercept = 0, size=.4, color="gray80", linetype="solid") +
geom_freqpoly(binwidth=.05, alpha=.9, size=.3) +
scale_color_manual(values=brewer.pal(n=12, name="Paired")) +
theme_bw() +
xlim(-1, 1) +
theme(legend.position="none",
axis.title.x = element_blank(),
axis.title.y = element_blank())
p.corr.frequency.neg.controls.all <- ggplot(neg.corr, aes(x=corr, colour=control, linetype=control)) +
geom_vline(xintercept = 0, size=.4, color="gray80", linetype="solid") +
geom_freqpoly(binwidth=.05, alpha=.9, size=.75) +
theme_classic() +
xlim(-1, 1) +
ylim(0,600) +
scale_color_manual(values=brewer.pal(n=12, name="Paired")) +
theme(legend.position=c(0.85,0.8),
legend.title=element_blank(),
legend.background=element_rect(fill=alpha('white', 0.5))) +
ylab("Frequency") +
xlab("Marginal correlation") +
theme(legend.key.width = unit(2.4,"cm")) +
annotation_custom(ggplotGrob(p.insert), xmin=-1.1, xmax=-0.3, ymin=300, ymax=600)
print(p.corr.frequency.neg.controls.all)
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_CorrDensityNeg_Freq_All.pdf"), p.corr.frequency.neg.controls.all, width = 8, height=8, device="pdf")
}
}
## Warning: Removed 22 row(s) containing missing values (geom_path).
## Warning: Removed 22 row(s) containing missing values (geom_path).
## Warning: Removed 22 row(s) containing missing values (geom_path).
if(settings$generate.paper.figs) {
par(mfrow=c(1,1))
# un-normalized vs TMM
p.scatter.TMM <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.TMM$pos$corr,
clusters=clusters, title="TMM", xlab="un-normalized", ylab="TMM",
ccc=round(DANA.results$metrics["TCGA.TMM", "cc"],3))
# un-normalized vs DESeq
p.scatter.DESeq <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.DESeq$pos$corr,
clusters=clusters, title="DESeq", xlab="un-normalized", ylab="DESeq",
ccc=round(DANA.results$metrics["TCGA.DESeq", "cc"],3))
# un-normalized vs PoissonSeq
p.scatter.PoissonSeq <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.PoissonSeq$pos$corr,
clusters=clusters, title="PoissonSeq", xlab="un-normalized", ylab="PoissonSeq",
ccc=round(DANA.results$metrics["TCGA.PoissonSeq", "cc"],3))
# un-normalized vs TC
p.scatter.TC <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.TC$pos$corr,
clusters=clusters, title="TC", xlab="un-normalized", ylab="TC",
ccc=round(DANA.results$metrics["TCGA.TC", "cc"],3))
# un-normalized vs Med
p.scatter.Med <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.Med$pos$corr,
clusters=clusters, title="Med", xlab="un-normalized", ylab="Med",
ccc=round(DANA.results$metrics["TCGA.Med", "cc"],3))
# un-normalized vs RUVg
p.scatter.RUVg <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.RUVg$pos$corr,
clusters=clusters, title="RUVg", xlab="un-normalized", ylab="RUVg",
ccc=round(DANA.results$metrics["TCGA.RUVg", "cc"],3))
# un-normalized vs RUVs
p.scatter.RUVs <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.RUVs$pos$corr,
clusters=clusters, title="RUVs", xlab="un-normalized", ylab="RUVs",
ccc=round(DANA.results$metrics["TCGA.RUVs", "cc"],3))
# un-normalized vs RUVr
p.scatter.RUVr <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.RUVr$pos$corr,
clusters=clusters, title="RUVr", xlab="un-normalized", ylab="RUVr",
ccc=round(DANA.results$metrics["TCGA.RUVr", "cc"],3))
# un-normalized vs UQ
p.scatter.UQ <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.UQ$pos$corr,
clusters=clusters, title="UQ", xlab="un-normalized", ylab="UQ",
ccc=round(DANA.results$metrics["TCGA.UQ", "cc"],3))
# un-normalized vs QN
p.scatter.QN <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.QN$pos$corr,
clusters=clusters, title="QN", xlab="un-normalized", ylab="QN",
ccc=round(DANA.results$metrics["TCGA.QN", "cc"],3))
p.scatter.TCGA.all <- plot_grid(p.scatter.TMM,
p.scatter.DESeq,
p.scatter.PoissonSeq,
p.scatter.TC,
p.scatter.Med,
p.scatter.RUVg,
p.scatter.RUVs,
p.scatter.RUVr,
p.scatter.UQ,
p.scatter.QN,
ncol = 3,
align="hv")
plot(p.scatter.TCGA.all)
# Save plots
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_Scatter_All.pdf"), p.scatter.TCGA.all, width=9, height=12, device="pdf")
}
}
## Warning in plot.corr.scatter(corr1 = DANA.results$data.model$pos$corr, corr2 = DANA.results$norm.models$TCGA.RUVs$pos$corr, : Warning: Shapes of correlation matrices in plot.corr.scatter do not match!
## Reducing correlation matrices to all mutual genes.
if(settings$generate.paper.figs) {
options(scipen=4) # force non-scientific notation of x axis
test.DANA.metrics <- data.frame(
method=substr(rownames(DANA.results$metrics), 6, 20),
cc=DANA.results$metrics[, "cc"],
mscr=DANA.results$metrics[, "mscr"]
)
p.DANA.metrics <- ggplot(test.DANA.metrics,aes(x=mscr,y=cc, label=method)) +
geom_point(alpha=.5) +
theme_classic() +
xlab(TeX("$mscr^-$; Relative reduction of handling effects")) +
ylab(TeX("$cc^+$; Biological signal preservation")) +
geom_text_repel(aes(label = method), size=3, max.overlaps = Inf, min.segment.length = 0.3) +
# xlim(c(min(c(0, test.mposc.reduction)),max(test.mposc.reduction))) +
scale_x_continuous(labels = scales::percent, limits=c(0,1.05),
breaks = scales::pretty_breaks(n = 5)) +
# ylim(c(0,1))
scale_y_continuous(labels = scales::percent, limits = c(0.5,1))
print(p.DANA.metrics)
print("DANA Statistics for the TCGA-BRCA/UCS")
test.DANA.metrics$cc <- round(test.DANA.metrics$cc,3)
test.DANA.metrics$mscr <- round(test.DANA.metrics$mscr,3)
print(test.DANA.metrics)
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_DANAMetrics.pdf"), p.DANA.metrics, width=4.5, height=3.5, device="pdf")
}
}
## [1] "DANA Statistics for the TCGA-BRCA/UCS"
## method cc mscr
## 1 TMM 0.990 0.794
## 2 DESeq 0.990 0.803
## 3 PoissonSeq 0.988 0.796
## 4 TC 0.993 0.654
## 5 Med 0.924 0.572
## 6 RUVg 0.985 0.817
## 7 RUVs 0.829 0.855
## 8 RUVr 0.976 0.836
## 9 UQ 0.937 0.605
## 10 QN 0.972 0.831
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] de_DE.UTF-8/de_DE.UTF-8/de_DE.UTF-8/C/de_DE.UTF-8/de_DE.UTF-8
##
## attached base packages:
## [1] splines stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggcorrplot_0.1.3 latex2exp_0.5.0
## [3] RColorBrewer_1.1-2 cowplot_1.1.1
## [5] openxlsx_4.2.4 ffpe_1.38.0
## [7] TTR_0.24.2 DescTools_0.99.44
## [9] vsn_3.62.0 RUVSeq_1.28.0
## [11] EDASeq_2.28.0 ShortRead_1.52.0
## [13] GenomicAlignments_1.30.0 Rsamtools_2.10.0
## [15] Biostrings_2.62.0 XVector_0.34.0
## [17] sva_3.42.0 BiocParallel_1.28.2
## [19] genefilter_1.76.0 mgcv_1.8-38
## [21] nlme_3.1-153 PoissonSeq_1.1.2
## [23] combinat_0.0-8 DESeq2_1.34.0
## [25] SummarizedExperiment_1.24.0 Biobase_2.54.0
## [27] MatrixGenerics_1.6.0 matrixStats_0.61.0
## [29] GenomicRanges_1.46.1 GenomeInfoDb_1.30.0
## [31] IRanges_2.28.0 S4Vectors_0.32.3
## [33] BiocGenerics_0.40.0 edgeR_3.36.0
## [35] limma_3.50.0 FastGGM_1.0
## [37] RcppParallel_5.1.4 Rcpp_1.0.7
## [39] huge_1.3.5 glmnet_4.1-3
## [41] Matrix_1.3-4 ggrepel_0.9.1
## [43] plotly_4.10.0 stargazer_5.2.2
## [45] corrplot_0.92 ggnewscale_0.4.5
## [47] gridExtra_2.3 ggplot2_3.3.5
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 R.utils_2.11.0
## [3] tidyselect_1.1.1 RSQLite_2.2.9
## [5] AnnotationDbi_1.56.2 htmlwidgets_1.5.4
## [7] grid_4.1.2 munsell_0.5.0
## [9] codetools_0.2-18 preprocessCore_1.56.0
## [11] nleqslv_3.3.2 withr_2.4.3
## [13] colorspace_2.0-2 filelock_1.0.2
## [15] highr_0.9 knitr_1.36
## [17] rstudioapi_0.13 labeling_0.4.2
## [19] GenomeInfoDbData_1.2.7 hwriter_1.3.2
## [21] farver_2.1.0 bit64_4.0.5
## [23] rhdf5_2.38.0 vctrs_0.3.8
## [25] generics_0.1.1 xfun_0.28
## [27] BiocFileCache_2.2.0 R6_2.5.1
## [29] illuminaio_0.36.0 locfit_1.5-9.4
## [31] reshape_0.8.8 bitops_1.0-7
## [33] rhdf5filters_1.6.0 cachem_1.0.6
## [35] DelayedArray_0.20.0 assertthat_0.2.1
## [37] BiocIO_1.4.0 scales_1.1.1
## [39] rootSolve_1.8.2.3 gtable_0.3.0
## [41] affy_1.72.0 methylumi_2.40.1
## [43] lmom_2.8 rlang_0.4.12
## [45] rtracklayer_1.54.0 lazyeval_0.2.2
## [47] GEOquery_2.62.1 BiocManager_1.30.16
## [49] yaml_2.2.1 crosstalk_1.2.0
## [51] GenomicFeatures_1.46.1 tools_4.1.2
## [53] nor1mix_1.3-0 affyio_1.64.0
## [55] ellipsis_0.3.2 lumi_2.46.0
## [57] jquerylib_0.1.4 siggenes_1.68.0
## [59] proxy_0.4-26 plyr_1.8.6
## [61] sparseMatrixStats_1.6.0 progress_1.2.2
## [63] zlibbioc_1.40.0 purrr_0.3.4
## [65] RCurl_1.98-1.5 prettyunits_1.1.1
## [67] openssl_1.4.5 bumphunter_1.36.0
## [69] zoo_1.8-9 sfsmisc_1.1-12
## [71] magrittr_2.0.1 data.table_1.14.2
## [73] mvtnorm_1.1-3 aroma.light_3.24.0
## [75] hms_1.1.1 evaluate_0.14
## [77] xtable_1.8-4 XML_3.99-0.8
## [79] jpeg_0.1-9 mclust_5.4.8
## [81] shape_1.4.6 compiler_4.1.2
## [83] biomaRt_2.50.1 minfi_1.40.0
## [85] tibble_3.1.6 KernSmooth_2.23-20
## [87] crayon_1.4.2 R.oo_1.24.0
## [89] htmltools_0.5.2 tzdb_0.2.0
## [91] tidyr_1.1.4 geneplotter_1.72.0
## [93] expm_0.999-6 Exact_3.1
## [95] DBI_1.1.1 dbplyr_2.1.1
## [97] MASS_7.3-54 rappdirs_0.3.3
## [99] boot_1.3-28 readr_2.1.1
## [101] quadprog_1.5-8 R.methodsS3_1.8.1
## [103] parallel_4.1.2 igraph_1.2.9
## [105] pkgconfig_2.0.3 xml2_1.3.3
## [107] foreach_1.5.1 annotate_1.72.0
## [109] rngtools_1.5.2 multtest_2.50.0
## [111] beanplot_1.2 doRNG_1.8.2
## [113] scrime_1.3.5 stringr_1.4.0
## [115] digest_0.6.29 base64_2.0
## [117] rmarkdown_2.11 gld_2.6.3
## [119] DelayedMatrixStats_1.16.0 restfulr_0.0.13
## [121] curl_4.3.2 rjson_0.2.20
## [123] lifecycle_1.0.1 jsonlite_1.7.2
## [125] Rhdf5lib_1.16.0 askpass_1.1
## [127] viridisLite_0.4.0 fansi_0.5.0
## [129] pillar_1.6.4 lattice_0.20-45
## [131] KEGGREST_1.34.0 fastmap_1.1.0
## [133] httr_1.4.2 survival_3.2-13
## [135] glue_1.5.1 xts_0.12.1
## [137] zip_2.2.0 png_0.1-7
## [139] iterators_1.0.13 bit_4.0.4
## [141] class_7.3-19 stringi_1.7.6
## [143] HDF5Array_1.22.1 blob_1.2.2
## [145] latticeExtra_0.6-29 memoise_2.0.1
## [147] dplyr_1.0.7 e1071_1.7-9